
import excel "hpc output.xlsx", sheet("20200605 hpc output") firstrow
duplicates report fileName
duplicates report $allind
duplicates drop $allind ,force
gen id=subinstr(fileName,".txt","",.)

label data "non empty, no duplicates, text only no clause"
save "hpc7.dta",replace


use "../hpc7.dta",clear
merge 1:1 id using "../metadata.dta",gen(_merge1)

keep if _merge1==3
de,s
findname, all(@==0)
return list
drop `r(varlist)'
*drop pos_numinterjections pos_numwhpronouns syn_numconstituentspersen wn_numhypernyms
save "../metahpc7.dta",replace

use "../metahpc7.dta",clear
*ACCORDING TO TOP 100 APPLICANTS NAMING
gen uni=0
replace uni=1 if strpos(Applicants, "UNIV")
replace uni=1 if strpos(Applicants, "INST")
replace uni=1 if strpos(Applicants, "COLLEGE")
gen firm=0
replace firm=1 if strpos(Applicants, "INC")
replace firm=1 if strpos(Applicants, "LTD")
replace firm=1 if strpos(Applicants, "CORP")
replace firm=1 if strpos(Applicants, "LLC")
replace firm=1 if strpos(Applicants, "CO")
replace firm=1 if strpos(Applicants, "INNOVATION")
replace firm=1 if strpos(Applicants, "ENERGY")
replace firm=1 if strpos(Applicants, "ELECTRIC")
replace firm=1 if strpos(Applicants, "ELECTRONICS")
replace firm=1 if strpos(Applicants, "MOTOR")
replace firm=1 if strpos(Applicants, "AGENCY")
replace firm=1 if strpos(Applicants, "INDUSTRIES")
replace firm=1 if strpos(Applicants, "TELECOM")
replace firm=1 if strpos(Applicants, "SAMSUNG")
replace firm=1 if strpos(Applicants, "HON_HAI")
replace firm=1 if strpos(Applicants, "IBM")
replace firm=1 if strpos(Applicants, "DU_PONT")
replace firm=1 if strpos(Applicants, "SONY")
replace firm=1 if strpos(Applicants, "TOSHIBA")
replace firm=1 if strpos(Applicants, "ENERGIE")
replace firm=1 if strpos(Applicants, "FAN")
replace firm=1 if strpos(Applicants, "ENERGIE")
replace firm=1 if strpos(Applicants, "SHARP")
replace firm=1 if strpos(Applicants, "CANON")
replace firm=1 if strpos(Applicants, "BASF")
replace firm=1 if strpos(Applicants, "CANON")
replace firm=1 if strpos(Applicants, "BOSCH")
replace firm=1 if strpos(Applicants, "NOKIA")
replace firm=1 if strpos(Applicants, "HONGFUJIN")
replace firm=1 if strpos(Applicants, "ARKEMA")
replace firm=1 if strpos(Applicants, "SIEMENS")
replace firm=1 if strpos(Applicants, "TOYOTA")
 
gen other=0
replace other=1 if firm==0 & uni==0
gen both=0
replace both=1 if firm==1 & uni==1
replace firm=0 	if both==1
replace uni=0 	if both==1



*SUBCATEGORY
split US_Classifications,p(;;)

foreach v in 320 136 977 {
capture drop sub`v'
gen sub`v'=US_Classifications1 if substr(US_Classifications1,1,3)=="`v'"
forvalues i=2/40{
replace sub`v'=US_Classifications`i' if substr(US_Classifications`i',1,3)=="`v'"
}
replace sub`v'="0" if sub`v'==""
capture drop uspc`v'
gen uspc`v'=0
replace uspc`v'=1 if sub`v'!="0"
}

gen uspc=136 if uspc136==1
replace uspc=320 if uspc320==1
replace uspc=977 if uspc977==1

/*gen uspc=977 if strpos(US_Classifications , "977")
replace uspc=320 if strpos(US_Classifications , "320")
replace uspc=136 if strpos(US_Classifications , "136")
*/
la var firm "Firms"
la de firm 0 "Non firms" 1 "Firms"
la val firm firm
la de uni 0 "Non universities" 1 "Universities"
la val uni uni
la var both "Both Uni and Firm"
la var other "Others"


ren DISC_* D_*
ren POS_* P_*
ren SYN_* S_*

findname, any(length("@") > 22)
di "`r(varlist)'"
foreach v in `r(varlist)' {
di "`v'"
	local new = substr("`v'", 1, 22)
di "`new'"
	ren `v' `new'
}
ren NPL_Resolved_External_ NPL_Resolved_Ext_Ids
ren NPL_Resolved_Citation_ NPL_Resol_Citation_Ct
*save "../metahpc7_test.dta",replace
cd "../"
capture drop SD*
foreach v in $VAR {
egen SD`v'=std(`v')
}



dropmiss,force

gen entity=1 if uni==1
replace entity=2 if firm==1
replace entity=3 if both==1
replace entity=4 if other==1

save "../metahpc7_cleaned.dta",replace

import excel "../Lens_NumClaims_yi.xlsx", sheet("lens_ids_claim_number") firstrow clear
save "../Lens_NumClaims_yi.dta",replace

di "a few duplicates of app_id in ""../OUTDATA/Lens_firstaction112a_v2.dta""
use "../Lens_cleaned_V4.dta",clear
do  "../0_global_v1.do"


egen  semicolons=noccur(Inventors),string(";;")
tab semicolons
gen NumInventors=semicolons+1
gen year=year(Application_Date)

merge 1:1 id using "../Lens_NumClaims_yi.dta",generate(_MergeClaims)
drop if _MergeClaims==2 /*drop num claims not in the Lens sample*/
save "../OUTDATA/Lens_cleaned_V5.dta",replace



import excel "../lens-meta_V2.xlsx", sheet("lens-export") firstrow clear

ren LensID id
keep id Owners ApplicationNumber
*destring id,replace ignore("-")
*format id %23s

save ../OUTDATA/metadata_V2.dta,replace
use ../OUTDATA/metadata_V2.dta,clear
merge 1:1 id using "../Lens_cleaned_V3.dta",gen(_mergeOwner)
keep if _mergeOwner==3 /*drop extra in owner data*/

gen LicenseCorp=0 if firm==1
replace LicenseCorp=1 if strpos(Owners, "QUALCOMM")
replace LicenseCorp=1 if strpos(Owners, "INTERNATIONAL BUSINESS MACHINES")
replace LicenseCorp=1 if strpos(Owners, "HEWLETT")
replace LicenseCorp=1 if strpos(Owners, "HEWLETT-PACKARD")
sum LicenseCorp

save "../Lens_cleaned_V6_Domestic_V1.dta"
